# -------------------------------------------------------------------
#       -*- coding: utf-8 -*-
#   @Project    :   spider
#   @File       :   21_douban250_re_csv.py
#   @Author     :   WANGYU
#   @Time       :   2021-08-17 09:12:27
#   @Software   :   PyCharm
#   @Desc       :   豆瓣TOP250爬虫
# -------------------------------------------------------------------


# 1.提取页面源代码 requests
import requests
# 2.利用正则表达式进一步提取 re
import re
# 3.引入数据格式文件
import csv

# The Top 250 list is paginated; each page shows PAGE_SIZE movies and is
# selected with the ``start`` query parameter (0, 25, 50, ...).
TOTAL_MOVIES = 250
PAGE_SIZE = 25

# Rows are appended to this file, so repeated runs accumulate data.
OUTPUT_FILE = '21_data.csv'

# A browser-like User-Agent is sent with every request.
REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'
}

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10

# Extracts one movie per <li> element. re.S lets "." match newlines as well,
# which is required because every entry spans many lines of HTML.
# Compiled once here instead of once per page.
MOVIE_PATTERN = re.compile(r'<li>.*?<span class="title">(?P<name>.*?)'
                           r'</span>.*?<p class="">.*?<br>(?P<year>.*?)'
                           r'&nbsp.*?<span class="rating_num" property="v:average">(?P<score>.*?)'
                           r'</span>.*?<span>(?P<num>.*?)人评价</span>', re.S)


def parse_movies(page_content: str) -> list:
    """Extract the movie records from the HTML source of one list page.

    Args:
        page_content: HTML text of a Top 250 list page.

    Returns:
        A list of ``[name, year, score, num]`` rows (all strings), one per
        regex match, in page order. The year is stripped of the surrounding
        whitespace captured from the markup. Returns an empty list when
        nothing matches.
    """
    return [
        [
            match.group('name'),
            match.group('year').strip(),
            match.group('score'),
            match.group('num'),
        ]
        for match in MOVIE_PATTERN.finditer(page_content)
    ]


def main():
    """Download every page of the Top 250 list and append the rows to the CSV.

    Raises:
        requests.RequestException: if a page cannot be fetched in time or the
            server answers with an HTTP error status. Rows of the pages
            processed before the failure have already been written.
    """
    # newline='' is required by the csv module (otherwise blank lines appear
    # between rows on Windows); the explicit encoding keeps the Chinese
    # titles writable regardless of the platform's default locale.
    with open(OUTPUT_FILE, mode='a', newline='', encoding='utf-8') as f:
        csv_write = csv.writer(f)
        for start in range(0, TOTAL_MOVIES, PAGE_SIZE):
            url = f'https://movie.douban.com/top250?start={start}&filter='
            resp = requests.get(url, headers=REQUEST_HEADERS,
                                timeout=REQUEST_TIMEOUT)
            # Fail loudly instead of silently writing nothing for a page
            # that was blocked or could not be served.
            resp.raise_for_status()
            csv_write.writerows(parse_movies(resp.text))
            print('over')


if __name__ == '__main__':
    main()

'''
        <li>
            <div class="item">
                <div class="pic">
                    <em class="">1</em>
                    <a href="https://movie.douban.com/subject/1292052/">
                        <img width="100" alt="肖申克的救赎" src="https://img2.doubanio.com/view/photo/s_ratio_poster/public/p480747492.webp" class="">
                    </a>
                </div>
                <div class="info">
                    <div class="hd">
                        <a href="https://movie.douban.com/subject/1292052/" class="">
                            <span class="title">肖申克的救赎</span>
                                    <span class="title">&nbsp;/&nbsp;The Shawshank Redemption</span>
                                <span class="other">&nbsp;/&nbsp;月黑高飞(港)  /  刺激1995(台)</span>
                        </a>


                            <span class="playable">[可播放]</span>
                    </div>
                    <div class="bd">
                        <p class="">
                            导演: 弗兰克·德拉邦特 Frank Darabont&nbsp;&nbsp;&nbsp;主演: 蒂姆·罗宾斯 Tim Robbins /...<br>
                            1994&nbsp;/&nbsp;美国&nbsp;/&nbsp;犯罪 剧情
                        </p>

                        
                        <div class="star">
                                <span class="rating5-t"></span>
                                <span class="rating_num" property="v:average">9.7</span>
                                <span property="v:best" content="10.0"></span>
                                <span>2424702人评价</span>
                        </div>

                            <p class="quote">
                                <span class="inq">希望让人自由。</span>
                            </p>
                    </div>
                </div>
            </div>
        </li>

'''
